In [ ]:
import numpy as np
import pandas as pd
from datetime import datetime

import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'

from pyspark.sql import SparkSession
In [ ]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
In [ ]:
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('SPL_Checkouts_Milestone') \
    .getOrCreate() 

sc = spark.sparkContext
In [ ]:
df = spark.read.csv("2010-20_spl_physical_checkouts.csv", header=True, inferSchema= True)

spl = df.withColumn('CheckoutMonth', regexp_extract('CheckoutDateTime', r"([0-9\-]+)", 1))
spl = spl.withColumn('CheckoutDate', substring('CheckoutDateTime', 4, 2)) 
spl = spl.withColumn("Subjects", split(col("Subjects"), " "))
spl.show(1)
+--------------------+------------+---------+-----------+--------+----------+-------------------+-------------+----------------+--------------------+-------------+------------+
|                  ID|CheckoutYear|BibNumber|ItemBarcode|ItemType|Collection|         CallNumber|    ItemTitle|        Subjects|    CheckoutDateTime|CheckoutMonth|CheckoutDate|
+--------------------+------------+---------+-----------+--------+----------+-------------------+-------------+----------------+--------------------+-------------+------------+
|20100128110900001...|        2010|  1287425|10027511244|    acbk|      nanf|641.822 WILLAN 1992|Perfect pasta|[Cookery, Pasta]|01/28/2010 11:09:...|           01|          28|
+--------------------+------------+---------+-----------+--------+----------+-------------------+-------------+----------------+--------------------+-------------+------------+
only showing top 1 row

In [ ]:
spl.createOrReplaceTempView('spl')

spl_view = spark.sql("SELECT CheckoutDate, CheckoutMonth, CheckoutYear, Collection, ItemType, ItemTitle, Subjects FROM spl")

spl_view.createOrReplaceTempView('spl_view')

spl_view.columns
Out[ ]:
['CheckoutDate',
 'CheckoutMonth',
 'CheckoutYear',
 'Collection',
 'ItemType',
 'ItemTitle',
 'Subjects']
In [ ]:
ils = pd.read_csv("Integrated_Library_System__ILS__Data_Dictionary.csv")
ils.set_index("Code", inplace=True)
ils.head()
Out[ ]:
Description Code Type Format Group Format Subgroup Category Group Category Subgroup Age Group
Code
cazover CA7-zine collection oversize ItemCollection Print Book Periodical NaN Adult
caziner CA7-zine collection reference ItemCollection Print Book Periodical NaN Adult
cazval CA7-zine collection valuable mat. ItemCollection Print Book Periodical NaN Adult
nga Northgate Branch Location NaN NaN NaN NaN NaN
hip High Point Branch Location NaN NaN NaN NaN NaN
In [ ]:
spl_yearly = spark.sql("""
    SELECT CheckoutYear, ItemType, COUNT(*) as NumCheckouts
    FROM spl_view 
    GROUP BY CheckoutYear, ItemType
    ORDER By NumCheckouts DESC;
""")
spl_yearly_df = spl_yearly.toPandas()
spl_yearly_df.head()
Out[ ]:
CheckoutYear ItemType NumCheckouts
0 2010 acbk 2649827
1 2011 acbk 2486120
2 2013 acbk 2485961
3 2010 acdvd 2396572
4 2012 acbk 2319616
In [ ]:
spl_yearly_df['Item Type'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Description'])
spl_yearly_df['Age Group'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Age Group'])
spl_yearly_df['Format'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Format Subgroup'])
spl_yearly_df['FormatType'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Format Group'])
spl_yearly_df['Category'] = spl_yearly_df['ItemType'].apply(lambda t: "Other" if t==np.nan else ils.loc[t]['Category Group'])
spl_yearly_df.rename({"NumCheckouts": "Number of Checkouts"}, axis=1, inplace=True)
spl_yearly_df.head()
Out[ ]:
CheckoutYear ItemType Number of Checkouts Item Type Age Group Format FormatType Category
0 2010 acbk 2649827 Book: Adult/YA Adult Book Print Miscellaneous
1 2011 acbk 2486120 Book: Adult/YA Adult Book Print Miscellaneous
2 2013 acbk 2485961 Book: Adult/YA Adult Book Print Miscellaneous
3 2010 acdvd 2396572 DVD: Adult/YA Adult Video Disc Media Miscellaneous
4 2012 acbk 2319616 Book: Adult/YA Adult Book Print Miscellaneous
In [ ]:
fig1 = px.bar(spl_yearly_df.fillna("Other"), 
            x="CheckoutYear", 
            y="Number of Checkouts", 
            color="Format",
            color_discrete_sequence=["forestgreen", "goldenrod", "tomato", "turquoise", 
            "indigo", "black", "blue", "orange", 
            "brown", "black", "teal", "grey", "orchid"],
            facet_col="Age Group",
            height=750, width=1000,
            title="Seattle Public Library Physical Checkout Format (2010-2020)")
fig1.show()
In [ ]:
fig2 = px.bar(spl_yearly_df, 
            x="CheckoutYear", 
            y="Number of Checkouts", 
            color="Age Group",
            color_discrete_sequence=["goldenrod", "teal"],
            height=500, width=750,
            title="Seattle Public Library Physical Checkouts (2010-2020)")
fig2.show()
In [ ]:
spl_yearly_equiptment_df = spl_yearly_df[spl_yearly_df.FormatType == 'Equipment']
spl_yearly_equiptment_df.head()
Out[ ]:
CheckoutYear ItemType Number of Checkouts Item Type Age Group Format FormatType Category
75 2018 alaptop 14375 Laptop: Hourly Circulating Adult NaN Equipment Miscellaneous
86 2019 alaptop 9809 Laptop: Hourly Circulating Adult NaN Equipment Miscellaneous
92 2018 aceq 7831 Equipment: Adult/YA Adult NaN Equipment Miscellaneous
93 2019 aceq 7746 Equipment: Adult/YA Adult NaN Equipment Miscellaneous
94 2016 aceq 7066 Equipment: Adult/YA Adult NaN Equipment Miscellaneous
In [ ]:
fig3 = px.bar(spl_yearly_equiptment_df, 
            y="CheckoutYear", 
            x="Number of Checkouts", 
            color="Item Type",
            color_discrete_sequence=["goldenrod", "olive", 
            "indigo", "brown", "teal", "grey", "red"],
            facet_col="Age Group",
            orientation='h',
            height=600,
            barmode="stack",
            title="Seattle Public Library Physical Equipment Checkout (2010-2020)")
fig3.show()
In [ ]:
spl_collections_yearly = spark.sql("""
    SELECT CheckoutYear, CheckoutMonth, Collection, COUNT(*) as NumCheckouts
    FROM spl_view 
    GROUP BY CheckoutYear, CheckoutMonth, Collection
    ORDER By NumCheckouts DESC;
""")
spl_collections_yearly = spl_collections_yearly.toPandas()
spl_collections_yearly.head()
Out[ ]:
CheckoutYear CheckoutMonth Collection NumCheckouts
0 2010 03 nadvd 170502
1 2010 01 nadvd 169509
2 2010 06 nadvd 155156
3 2010 04 nadvd 153769
4 2010 05 nadvd 149921
In [ ]:
spl_collections_yearly['Collection Type'] = spl_collections_yearly['Collection'].apply(lambda t: "Other" if t==np.nan else str(ils.loc[t]['Description']))
spl_collections_yearly = spl_collections_yearly.rename({"NumCheckouts": "Number of Checkouts"}, axis=1).dropna()
spl_collections_yearly.head()
Out[ ]:
CheckoutYear CheckoutMonth Collection Number of Checkouts Type Collection Type
0 2010 03 nadvd 170502 NA-DVD, Fiction NA-DVD, Fiction
1 2010 01 nadvd 169509 NA-DVD, Fiction NA-DVD, Fiction
2 2010 06 nadvd 155156 NA-DVD, Fiction NA-DVD, Fiction
3 2010 04 nadvd 153769 NA-DVD, Fiction NA-DVD, Fiction
4 2010 05 nadvd 149921 NA-DVD, Fiction NA-DVD, Fiction
In [ ]:
fig4 = px.bar(spl_collections_yearly, 
            x="CheckoutYear", 
            y="Number of Checkouts", 
            color="Collection Type",
            color_discrete_sequence=px.colors.qualitative.G10,
            # color_discrete_sequence=["goldenrod", "olive", 
            # "indigo", "brown", "teal", "grey", "red"],
            height=1000, width=2500,
            title="Seattle Public Library Collection Circulation By Year (2010-2020)")
fig4.show()
In [ ]:
spl_collections_monthly = spark.sql("""
    SELECT CheckoutMonth, Collection, COUNT(Collection) as NumCheckouts
    FROM spl_view 
    GROUP BY CheckoutMonth, Collection
    ORDER By NumCheckouts DESC;
""")
spl_collections_monthly = spl_collections_monthly.toPandas()
spl_collections_monthly.head()
Out[ ]:
CheckoutMonth Collection NumCheckouts
0 01 nadvd 1129150
1 03 nadvd 1097255
2 02 nadvd 1027768
3 04 nadvd 1003426
4 07 nadvd 980306
In [ ]:
spl_collections_monthly['Collection Type'] = spl_collections_monthly['Collection'].apply(lambda t: "Other" if t==np.nan else str(ils.loc[t]['Description']))
spl_collections_monthly = spl_collections_monthly.rename({"NumCheckouts": "Number of Checkouts"}, axis=1).dropna()

spl_collections_monthly['CheckoutMonth'] = spl_collections_monthly['CheckoutMonth'].apply(lambda t: int(t))

spl_collections_monthly.head()
Out[ ]:
CheckoutMonth Collection Number of Checkouts Collection Type
0 1 nadvd 1129150 NA-DVD, Fiction
1 3 nadvd 1097255 NA-DVD, Fiction
2 2 nadvd 1027768 NA-DVD, Fiction
3 4 nadvd 1003426 NA-DVD, Fiction
4 7 nadvd 980306 NA-DVD, Fiction
In [ ]:
fig5 = px.bar(spl_collections_monthly, 
            x="CheckoutMonth", 
            y="Number of Checkouts", 
            color="Collection Type",
            color_discrete_sequence=px.colors.qualitative.G10,
            height=1000, width=2500,
            title="Seattle Public Library Collection Circulation By Month (2010-2020)")

fig5.show()